*********************************************************************************
*This file cleans data of Chinese imports from China of the UN Comtrade database*
*********************************************************************************

*Import file, generate variable with state name using file name, rename variables, save files and append them

*CHINA
cd "$raw_data_imports/china shock"
set more off

local allfiles : dir "$raw_data_imports/china shock" files "comt*" 
tempfile temp_china
clear
save temp_china, emptyok replace
foreach file of local allfiles {
    import delimit using `file', varnames(1) clear
	quiet tostring *, replace force
    quiet append using temp_china
    save temp_china, replace
}

gen partner_class = "china"

drop period-tradeflowcode
drop reportercode reporteriso partnercode 
drop partneriso-modeoftransport
drop qtyunitcode-grossweightkg
drop ciftradevalueus-flag
drop tradeflow
rename tradevalueus imports

destring year imports, replace

gen country = "us" if reporter == "USA"
replace country = "ger" if reporter == "Germany"
replace country = "ch" if reporter == "Switzerland"
replace country = "nz" if reporter == "New Zealand"
replace country = "jp" if reporter == "Japan"
replace country = "es" if reporter == "Spain"
replace country = "dk" if reporter == "Denmark"
replace country = "fi" if reporter == "Finland"
replace country = "aus" if reporter == "Australia"

egen group = group(country year commoditycode imports)
collapse (firstnm) country year reporter partner imports commoditycode partner_class, by(group)
drop group


save "$clean_data_imports/china_imports_raw", replace
rm "$raw_data_imports/china shock/temp_china.dta"


*GENERATE ONE STATA DATASET FOR EACH COUNTRY (US, CH, GER, ...)
set more off
use "$clean_data_imports/china_imports_raw", clear
levelsof country, local(levels) 
foreach s of local levels {
	use "$clean_data_imports/china_imports_raw", clear
	keep if country == "`s'"
	sort year partner commoditycode
	save "$clean_data_imports/comtrade_`s'", replace
}

*******************************************************************************************
*							HS standardization (to HS2002)
*******************************************************************************************
*Standardize everything to HS02 such that afterwards there is a perfect match with Dorn's crosswalk to SIC
foreach country in us ch ger es jp nz dk fi aus {

	use "$clean_data_imports/comtrade_`country'", clear

	*Replace this code which is not recognized otherwise in Dorn's classification
	replace commoditycode = "640419" if commoditycode == "640320"

	replace commoditycode = "0" + commoditycode if length(commoditycode) == 5

	gen hs17  = commoditycode
	gen hs12  = commoditycode
	gen hs07  = commoditycode

	save "$clean_data_imports/comtrade_`country'", replace

	*HS17 to HS12
	import excel using "$raw_data_imports/china shock/hs17 to hs12.xlsx", firstrow clear
	replace hs17 = "0" + hs17 if length(hs17) == 5
	replace hs12 = "0" + hs12 if length(hs12) == 5

	merge 1:m hs17 using "$clean_data_imports/comtrade_`country'"
	replace commoditycode = hs12 if _merge == 3 
	drop if _merge==1
	drop _merge

	egen group = group(country year commoditycode)
	collapse (firstnm) country year reporter partner commoditycode partner_class hs12 hs07 (sum) imports, by(group)
	drop group

	save "$clean_data_imports/comtrade_`country'", replace

	*HS12 to HS07
	import excel using "$raw_data_imports/china shock/hs12 to hs07.xlsx", firstrow clear
	replace hs12 = "0" + hs12 if length(hs12) == 5
	replace hs07 = "0" + hs07 if length(hs07) == 5

	merge 1:m hs12 using "$clean_data_imports/comtrade_`country'"
	replace commoditycode = hs07 if _merge == 3 
	drop if _merge==1
	drop _merge

	egen group = group(country year commoditycode)
	collapse (firstnm) country year reporter partner commoditycode partner_class hs07 (sum) imports, by(group)
	drop group

	save "$clean_data_imports/comtrade_`country'", replace

	*HS07 to HS02
	import excel using "$raw_data_imports/china shock/hs07 to hs02.xlsx", firstrow clear
	replace hs07 = "0" + hs07 if length(hs07) == 5
	replace hs02 = "0" + hs02 if length(hs02) == 5

	merge 1:m hs07 using "$clean_data_imports/comtrade_`country'"
	replace commoditycode = hs02 if _merge == 3 
	drop if _merge==1
	drop _merge

	egen group = group(country year commoditycode)
	collapse (firstnm) country year reporter partner commoditycode partner_class (sum) imports, by(group)
	drop group

	save "$clean_data_imports/comtrade_`country'", replace
}

*******************************************************************************************
*	Employment by czones (SIC, from Autor, Dorn, Hanson, Acemoglu 2015) & Brendan Price
*******************************************************************************************
clear all
set more off

use "$contributed_data/cbp_czone_1991.dta", clear

keep if year == 1991
rename sic87dd sic

rename year year_emp

*Combine sectors that are not in manufacturing in aggregate sectors
replace sic = 100 if sic <999
replace sic = 1000 if sic>=1000&sic<=1499
replace sic = 1500 if sic>=1500&sic<=1799
replace sic = 4000 if sic>=4000&sic<=4999
replace sic = 5000 if sic>=5000&sic<=5199
replace sic = 6000 if sic>=6000&sic<=6799
replace sic = 7000 if sic>=7000&sic<=8999
drop if sic>=9000 //only interested in employment from private sector afterwards (no public administration needed)

gen industry = "agriculture" if sic == 100
replace industry = "mining" if sic == 1000
replace industry = "construction" if sic == 1500
replace industry = "manufacturing" if sic >= 2000 & sic<=3999
replace industry = "utilities" if sic == 4000
replace industry = "wholesale" if sic == 5000
replace industry = "retail" if sic == 5200
replace industry = "finance" if sic == 6000
replace industry = "services" if sic == 7000

collapse (firstnm) year_emp industry (sum) emp, by(sic czone)

gen sic87dd = sic

save "$clean_data_imports/employment_czone_industry_final", replace

*Only MANUFACTURING sector
use "$clean_data_imports/employment_czone_industry_final", clear
keep if sic>=2010 & sic<=3999

by sic, sort: gen nvals = _n == 1 
count if nvals
*392 industries
drop nvals

save "$clean_data_imports/employment_czone_industry_final_manufacturing", replace

*Dataset extensions over all years between 1993 and 2017
use "$clean_data_imports/employment_czone_industry_final", clear

*Need to expand employment dataset such that I have one obs for each sic, cz and year (repeated values of employment in 1990)
*for merge with import data by years for entire us

forval i=1993/2017 {
gen year`i' = emp
}

drop emp

reshape long year, i(czone sic year_emp) j(emp)
rename (year emp) (employment1991 year)

save "$clean_data_imports/employment_czone_industry_final_years", replace


*******************************************************************************************
*								Imports conversion (HS to SIC)
*******************************************************************************************

/*Need to create dataset with all combinations of country, year, hs (and with more than one hs if the same 
hs matches to different sic) to match crosswalk using m:1 with imports*/
*import delimit "01_Temp/04_conversion_hs_to_sic/tables/combi.csv", varnames(1) clear
import delimit "$raw_data_imports/china shock/id_crosswalk.csv", varnames(1) clear
cap rename ïid id
replace partner_class = "" if partner_class!="china" //only interested in China
fillin partner_class year id 
drop _fillin hs6 sic87dd share
drop if partner_class=="" | year==.
save "$clean_data_imports/all_combis.dta", replace

import delimit "$raw_data_imports/china shock/id_crosswalk.csv", varnames(1) clear
cap rename ïid id
drop partner_class year
merge 1:m id using "$clean_data_imports/all_combis.dta" 
drop _merge 

sort id partner_class year
save "$clean_data_imports/hs_sic_partner_year", replace

tostring hs6 sic87dd, replace
rename hs6 hs
replace hs = "0" + hs if length(hs) == 5
replace sic87dd = "0" + sic87dd if length(sic87dd) == 3
drop if sic87dd == "." //values that are missing in Dorn's crosswalk
save "$clean_data_imports/hs_sic_partner_year", replace

*Imports of countries expressed in sic
*(if no merge because of using, no problem, since from all combinations, there may not
*be imports for this combination)
*those who do not merge in the master, do not merge because the code is not in 
*Dorn's crosswalk list
set more off
foreach country in us ch ger es jp nz dk fi aus {
		use "$clean_data_imports/comtrade_`country'", clear
		rename commoditycode hs
		replace hs = "0" + hs if length(hs) == 5
		replace hs = "999999" if hs == "9999AA"
		drop if hs == "0TOTAL"
		sort year hs
	
		*Converge HS to SIC
		drop partner
		sort hs partner year
		drop if hs == "999999"
		*Drop duplicates from egen function
		bysort hs year: gen d = cond(_N==1,0,_n)
		drop if d>1
		drop d
		
		merge 1:m hs year using "$clean_data_imports/hs_sic_partner_year"
		replace sic87dd = "2911" if hs == "271013"|hs == "271014"|hs == "271016"|hs == "271021"|hs == "271022"|hs == "271025"|hs == "271026"|hs ==  "271027"|hs == "271029"|hs == "271113"|hs == "271115"|hs == "271116"
		*Example: still 1905 codes for which there are no imports from China
		*drop if imports==.
		replace imports = 0 if mi(imports)
		drop _merge

		*Total imports by sic and year (when same HS but different SIC, imports flow into SIC using shares by Dorn)
		bysort sic87dd year: egen import_industry = sum(import*share) if !mi(import)

		egen group = group(sic87dd year)
		collapse (firstnm) country year reporter partner_class sic87dd import_industry, by(group)
		drop group
		rename sic87dd sic
		destring sic, replace
		
		*Merge a few categories by hand
		replace sic = 2066 if sic == 2067
		replace sic = 2252 if sic == 2253
		replace sic = 2341 if sic == 2342
		replace sic = 3291 if sic == 3292
		replace sic = 3339 if sic == 3341
		
		egen group = group(sic year)
		collapse (firstnm) country year reporter partner_class sic (sum) import_industry, by(group)
		drop group
		
		save "$clean_data_imports/c_comtrade_`country'", replace
}


*Fill in missings in reporter and country
set more off
foreach country in us ch ger es jp nz dk fi aus {
	use "$clean_data_imports/c_comtrade_`country'", clear
	gsort -country
	replace country = country[_n-1] if mi(country)
	replace reporter = reporter[_n-1] if mi(reporter)
	sort year sic
	save "$clean_data_imports/c_comtrade_`country'", replace
}

*******************************
*Inflate imports to 2017 prices
*******************************

set more off
foreach country in us ch ger es jp nz dk fi aus {
	import delimit "$raw_data_imports/china shock/PCE.csv", clear
	drop if year < 1993
	merge 1:m year using "$clean_data_imports/c_comtrade_`country'.dta"
	gen import_industry1 = import_industry/100*base_2017
	drop import_industry base_* _merge
	rename import_industry1 imports
	sort partner_class year sic
	save "$clean_data_imports/c_comtrade_`country'_1.dta", replace

}

*
*******************************************************************************************
*			Change in imports (already sum up all other imports for instrument)
*******************************************************************************************	

*Append and sum up imports
use "$clean_data_imports/c_comtrade_ch_1.dta", replace
save "$clean_data_imports/c_comtrade_iv.dta", replace

set more off
foreach country in ger es jp nz dk fi aus {
	use "$clean_data_imports/c_comtrade_`country'_1", clear
	append using "$clean_data_imports/c_comtrade_iv.dta"
	save "$clean_data_imports/c_comtrade_iv.dta", replace
}

egen group = group(sic year)
collapse (firstnm) year sic (sum) imports, by(group)
drop group
rename imports iv_imports

merge 1:1 year sic using "$clean_data_imports/c_comtrade_us_1"
*Some imports are only in some countries and some in others
replace iv_imports = 0 if mi(iv_imports)

rename imports us_imports
replace us_imports = 0 if mi(us_imports)
drop _merge

*Combine sectors that are not in manufacturing in aggregate sectors
replace sic = 100 if sic <999
replace sic = 1000 if sic>=1000&sic<=1499
replace sic = 1500 if sic>=1500&sic<=1799
replace sic = 4000 if sic>=4000&sic<=4999
replace sic = 5000 if sic>=5000&sic<=5199
replace sic = 6000 if sic>=6000&sic<=6799
replace sic = 7000 if sic>=7000&sic<=8999
drop if sic>=9000 //only interested in employment from private sector afterwards (no public administration needed)

gen industry = "agriculture" if sic == 100
replace industry = "mining" if sic == 1000
replace industry = "construction" if sic == 1500
replace industry = "manufacturing" if sic >= 2000 & sic<=3999
replace industry = "utilities" if sic == 4000
replace industry = "wholesale" if sic == 5000
replace industry = "retail" if sic == 5200
replace industry = "finance" if sic == 6000
replace industry = "services" if sic == 7000

egen group = group(sic year)
collapse (firstnm) year sic industry partner_class (sum) us_imports iv_imports, by(group)
drop group

keep year sic iv_imports us_imports partner_class
order partner_class year sic iv_imports us_imports

replace partner_class = "CHN"

save "$clean_data_imports/CHN_imports_dataset", replace

*Long-run differences
use "$clean_data_imports/CHN_imports_dataset", clear
save "$clean_data_imports/CHN_imports_dataset_d", replace

foreach start in 1993 2000 2007 {
	foreach end in 2000 2007 2014 {
	use "$clean_data_imports/CHN_imports_dataset", clear
	if `end' > `start' {
	keep if year == `start' | year == `end'
	egen id_year = group(year)
	xtset sic id_year
	gen d_us_imports_`start'_`end' = us_imports-L1.us_imports 
	gen d_iv_imports_`start'_`end' = iv_imports-L1.iv_imports 
	*If missing value, means that no product imports before of this year for this sic (hence, increase is equal to final value)
	drop if id_year == 1
	replace d_us_imports_`start'_`end' = us_imports if mi(d_us_imports_`start'_`end')
	replace d_iv_imports_`start'_`end' = iv_imports if mi(d_iv_imports_`start'_`end')
	keep partner_class sic d_us_imports_`start'_`end' d_iv_imports_`start'_`end'
	save "$clean_data_imports/d_imports_`start'_`end'", replace
	merge 1:m sic using "$clean_data_imports/CHN_imports_dataset_d"
	drop _merge
	save "$clean_data_imports/CHN_imports_dataset_d", replace
	}
	
	}
}
*If missing it means that this product in not imported in both periods (i.e. zero imports)


use "$clean_data_imports/CHN_imports_dataset_d", clear
rename sic sic87dd

egen group = group(sic)
collapse (firstnm) sic87dd partner_class d_*, by(group)
drop group

foreach var of varlist d_* {
	replace `var' = 0 if mi(`var')
}

save "$clean_data_imports/CHN_imports_dataset_d", replace

use "$clean_data_imports/CHN_imports_dataset_d", clear


************************
* Long-run differences
************************
*from local shocks regional

foreach g in "us" "iv" {
	foreach t in "1993_2000" "1993_2007" "1993_2014" "2000_2007" "2000_2014" "2007_2014" {
		local start = substr("`t'", 1, 4)
		local end = substr("`t'", 6, 4)

		* Load cleaned CZ-level data
		use "$clean_data_imports/employment_czone_industry_final", clear

		* Compute industry employment shares
		bysort czone year: egen czone_emp = total(emp)
		gen empshare = emp/czone_emp
		
		*Employment by industry over all CZ (total US)
		bysort sic87dd year: egen ind_emp = total(emp)
		
		drop emp czone_emp

		* Merge in each industry's change in import exposure
		merge m:1 sic87dd using "$clean_data_imports/CHN_imports_dataset_d", assert(1 3) 
		assert sic87dd < 2000 | sic87dd > 3999 if _merge == 1
		

		* Define non-manufacturing changes in import exposure as zero
		replace d_`g'_imports_`t' = 0 if _merge == 1
		drop _merge
		
		*Change in imports (in 1'000 of 2017 USD) per worker
		replace d_`g'_imports_`t' = ((d_`g'_imports_`t'/1000)/ind_emp)
		
		* Calculate average changes in import exposure
		collapse (mean) d_czone_imp_exp_`g'_`t' = d_`g'_imports_`t' [aw = empshare], by(czone year)

		if "`g'" == "us" {
			label var d_czone_imp_exp_`g'_`t' "Change in CZ's average import exposure, `start'-`end'"
		}
		else if "`g'" == "iv" {
			label var d_czone_imp_exp_`g'_`t' "Instrument for change in CZ's average import exposure, `start'-`end'"
		}
		
		* Save the shocks
		tempfile d_czone_imp_exp_`g'_`t'
		save "`d_czone_imp_exp_`g'_`t''.dta", replace
	}
}

* Combine the various changes in CZ import exposure into a single file
use "`d_czone_imp_exp_us_1993_2000'.dta", clear
merge 1:1 czone using "`d_czone_imp_exp_us_1993_2007'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_us_1993_2014'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_us_2000_2007'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_us_2000_2014'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_us_2007_2014'.dta", assert(3) nogenerate

merge 1:1 czone using "`d_czone_imp_exp_iv_1993_2000'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_iv_1993_2007'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_iv_1993_2014'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_iv_2000_2007'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_iv_2000_2014'.dta", assert(3) nogenerate
merge 1:1 czone using "`d_czone_imp_exp_iv_2007_2014'.dta", assert(3) nogenerate


compress
save "$final_data_imports/czone_CHN_imports_dataset_d_long", replace


*Create stacked dataset with three periods

use "$final_data_imports/czone_CHN_imports_dataset_d_long", clear

reshape long d_czone_imp_exp_, i(czone year_emp) j(temp) string
gen years = substr(temp, 4, 9)
gen country = substr(temp, 1, 2)
drop temp
gen start = substr(years, 1, 4)
gen end = substr(years, 6, 4)
destring start end, replace

save "`imports_stacked3''.dta", replace

foreach g in "us" "iv" {
	foreach t1 in "1993_2000" {
		foreach t2 in "2000_2007"{
			foreach t3 in "2007_2014" {
			use "`imports_stacked3''.dta", clear
			*keep if (years == "1993_2000" | years == "2000_2007" | years == "2007_2011") & country == "us"

			keep if (years == "`t1'" | years == "`t2'" | years == "`t3'") & country == "`g'"
			if end[1]==start[2] & end[2]==start[3] { //for example 1993-2000, 2000-2007, but not 1993-2000, 2007-2014
	
			egen group = group(czone)
			gen constant = 1
			bysort group: gen period3 = sum(constant)
			
			local period1 = substr(years[1],1,9)
			local period2 = substr(years[2],5,5)
			local period3 = substr(years[3],5,5)
			local period `period1'`period2'`period3'
			rename d_czone_imp_exp_ d3_exp_`g'_`period'

			keep czone year_emp d3_exp_`g'_`period' period3
			tempfile d_exp_`g'_`period'
			save "`d_exp_`g'_`period''.dta", replace
		}
	}
	}
	}
}
use "`d_exp_us_1993_2000_2007_2014'.dta", clear

merge 1:1 czone period3 using "`d_exp_iv_1993_2000_2007_2014'.dta", assert(3) nogenerate

compress
save "$final_data_imports/czone_CHN_imports_dataset_d_stacked3", replace

*Merge together with measure for all sectors - LONG-RUN
use "$final_data_imports/czone_CHN_imports_dataset_d_long", clear
*Rename such that consistent with automation variables
rename (d_czone_imp*1993_2000* d_czone_imp*1993_2007* d_czone_imp*1993_2014* d_czone_imp*2000_2007* d_czone_imp*2000_2014* d_czone_imp*2007_2014*) (d_czone_imp*93_00* d_czone_imp*93_07* d_czone_imp*93_14* d_czone_imp*00_07* d_czone_imp*00_14* d_czone_imp*07_14*)
save "$final_data_imports/czone_ExposureChinaLong", replace


*3 Periods (1993-2000-2007-2014)
use "$final_data_imports/czone_CHN_imports_dataset_d_stacked3", clear

gen year = 1990 if period3 == 1
replace year = 2000 if period3 == 2
replace year = 2008 if period3 == 3

rename (d3_exp_us_1993_2000_2007_2014 d3_exp_iv_1993_2000_2007_2014) (d_exp_us_stacked d_exp_iv_stacked)
keep czone year d_exp_us_stacked d_exp_iv_stacked

save "$final_data_imports/czone_ExposureChina.dta", replace
cap rm "`imports_stacked3''.dta"
